## Project: CueAnon
## Authors: Benjamin S. Noble and Taylor N. Carlson
## Code to replicate observational text analysis
## - Table 1, A1, A3, A4

library(sandwich)
library(lmtest)
library(MASS)
library(Matching)
library(rgenoud)
library(quanteda)
library(caret)
library(caretEnsemble)
library(modelsummary)
library(xtable)
library(tidyverse)
# NOTE(review): modelsummary option; presumably controls how numeric cells are
# wrapped in LaTeX output ("mathmode") -- confirm against the modelsummary docs.
options("modelsummary_format_numeric_latex" = "mathmode")

# Table A1, matching balance
# Candidates with coded text data; this is the sample that gets matched.
t_mat <- read_csv("coded_candidates.csv")
# Full data of all collected 2020 candidates.
fulldf <- read_csv("all_bp_candidates.csv")

# Covariate matrix, built once so the weight search and the matching step are
# guaranteed to see exactly the same columns in the same order.
match_covs <- cbind(
  t_mat$female, t_mat$prev_off, t_mat$other, t_mat$republican,
  t_mat$independent, t_mat$democratic, t_mat$senate, t_mat$ge_cand,
  t_mat$incumbent, t_mat$open, t_mat$pop_density, t_mat$pviR,
  t_mat$med_age, t_mat$pct_white, t_mat$pct_black, t_mat$pct_some_college,
  t_mat$med_hhi
)

# Genetic search for covariate weights (seeded before the search).
set.seed(326)
gen_out <- GenMatch(
  Tr = t_mat$qcand,
  X = match_covs,
  print.level = 0,
  pop.size = 1000,
  wait.generations = 10
)

# Match on the treatment indicator `qcand`, feeding in the weight matrix found
# by GenMatch above.
gen_matched <- Match(
  Tr = t_mat$qcand,
  X = match_covs,
  Weight = 3,
  Weight.matrix = gen_out$Weight.matrix
)

# matching statistics used in Table A1
# One balance formula shared by both MatchBalance() calls: treatment indicator
# on the left, the seventeen matching covariates on the right.
bal_formula <- qcand ~ female + prev_off + other + republican + independent +
  democratic + senate + ge_cand + incumbent + open + pop_density + pviR +
  med_age + pct_white + pct_black + pct_some_college + med_hhi

# Full set of candidates (no match object supplied).
set.seed(23)
full_sample_balance <- MatchBalance(
  bal_formula,
  data = fulldf,
  print.level = 1
)

# Coded subset, evaluated with the genetic matches from `gen_matched`.
set.seed(23)
match_bal <- MatchBalance(
  bal_formula,
  data = t_mat,
  match.out = gen_matched,
  print.level = 1
)

# Summarise a list of per-covariate balance results (one element of a
# MatchBalance() return value).
#
# bal_list: list whose elements each carry `mean.Tr`, `mean.Co`, a `tt` entry
#           with `p.value`, and optionally a `ks` entry with `ks.boot.pvalue`.
# Returns a list of three numeric vectors, one value per covariate:
#   mean_tr - treated-group means
#   mean_co - control-group means
#   p_value - the bootstrapped KS p-value when available, otherwise the
#             t-test p-value
# An empty `bal_list` yields zero-length vectors instead of an error.
balance_summary <- function(bal_list) {
  pick_p <- function(b) {
    ks_p <- b$ks$ks.boot.pvalue
    if (is.null(ks_p)) b$tt$p.value else ks_p
  }
  list(
    mean_tr = vapply(bal_list, function(b) b$mean.Tr, numeric(1),
                     USE.NAMES = FALSE),
    mean_co = vapply(bal_list, function(b) b$mean.Co, numeric(1),
                     USE.NAMES = FALSE),
    p_value = vapply(bal_list, pick_p, numeric(1), USE.NAMES = FALSE)
  )
}

# extract pre-matching balance (first element of the full-sample result)
pre_bal <- balance_summary(full_sample_balance[[1]])
q_supp <- pre_bal$mean_tr
non_supp <- pre_bal$mean_co
pv <- pre_bal$p_value

# extract post-matching balance (second element of the matched result)
post_bal <- balance_summary(match_bal[[2]])
q_supp1 <- post_bal$mean_tr
non_supp1 <- post_bal$mean_co
pv1 <- post_bal$p_value
# labels and order for table A1
# Covariate names, listed in the same order as the balance formula.
covar_labels <- c(
  "female", "prev_off", "other", "republican", "independent",
  "democratic", "senate", "ge_cand", "incumbent", "open", "pop_density",
  "pviR", "med_age", "pct_white", "pct_black", "pct_some_college", "med_hhi"
)
# Row position each covariate takes in the printed table.
table_order <- c(1, 2, 9, 6, 8, 7, 3, 4, 5, 10:17)
lab_df <- tibble(labs = covar_labels, ord = table_order)

# create dfs of values: group means and p-values rounded to two decimals.
# The columns are left unnamed on purpose so the generated headers stay as is.
pre_df <- tibble(round(q_supp, 2), round(non_supp, 2), round(pv, 2))
post_df <- tibble(round(q_supp1, 2), round(non_supp1, 2), round(pv1, 2))

# create table A1: labels next to pre/post values, sorted into table order,
# with the ordering helper column dropped before rendering
lab_df %>%
  bind_cols(pre_df, post_df) %>%
  arrange(ord) %>%
  select(-ord) %>%
  xtable()

# Table 1/A3, matching regression results
# create matched df: rows of the matched treated units stacked on top of the
# rows of their matched controls
idx_treated <- gen_matched$index.treated
idx_control <- gen_matched$index.control
newmatch <- bind_rows(t_mat[idx_treated, ], t_mat[idx_control, ])

# create weights and ids
#   gmw - match weights, repeated for the treated and the control half
#   ids - row index into `t_mat` for every stacked row
#   uid - pair number, shared by a treated row and its matched control
n_pairs <- nrow(newmatch) / 2
new_mat2 <- bind_cols(
  newmatch,
  gmw = rep(gen_matched$weights, 2),
  ids = c(idx_treated, idx_control),
  uid = rep(1:n_pairs, 2)
)

# how many people received any coverage at all, by group
# (one row per distinct name/qcand pair with total_n > 0, then tallied by qcand)
new_mat2 %>%
  filter(total_n > 0) %>%
  distinct(name, qcand) %>%
  count(qcand)

# models for Table 1/Table A3
# Both outcomes are predicted at qcand = 0 and qcand = 1.
pred_grid <- tibble(qcand = c(0, 1))

# Total article count: weighted negative binomial with HC3 standard errors.
mod_tot <- glm.nb(total_n ~ qcand, data = new_mat2, weights = gmw)
vcv_tot <- vcovHC(mod_tot, type = "HC3")
# Table A3, left column
coeftest(mod_tot, vcov. = vcv_tot)
# difference in articles (prediction at qcand = 1 minus prediction at 0)
nb_prtot <- predict(mod_tot, newdata = pred_grid, type = "response")
nb_prtot[2] - nb_prtot[1]

# Negative article count: same specification.
mod_neg <- glm.nb(n_neg ~ qcand, data = new_mat2, weights = gmw)
vcv_neg <- vcovHC(mod_neg, type = "HC3")
# Table A3, right column
coeftest(mod_neg, vcov. = vcv_neg)
# difference in articles (prediction at qcand = 1 minus prediction at 0)
nb_pr <- predict(mod_neg, newdata = pred_grid, type = "response")
nb_pr[2] - nb_pr[1]
# number of unique observations: distinct names among qcand == 1 rows, then
# among qcand == 0 rows
new_mat2 %>%
  filter(qcand == 1) %>%
  distinct(name)
new_mat2 %>%
  filter(qcand == 0) %>%
  distinct(name)

# --------------------------------------------------------------------------- #

# Table A4, regression with double coded sample
# read in double coded df (replaces the `t_mat` used for the earlier tables)
t_mat <- read_csv("ra_agree_negative_codes.csv")

# genetic matching
# Covariate matrix, built once so the weight search and the matching step are
# guaranteed to see exactly the same columns in the same order.
match_covs <- cbind(
  t_mat$female, t_mat$prev_off, t_mat$other, t_mat$republican,
  t_mat$independent, t_mat$democratic, t_mat$senate, t_mat$ge_cand,
  t_mat$incumbent, t_mat$open, t_mat$pop_density, t_mat$pviR,
  t_mat$med_age, t_mat$pct_white, t_mat$pct_black, t_mat$pct_some_college,
  t_mat$med_hhi
)

# genmatch weights (seeded before the search)
set.seed(326)
gen_out <- GenMatch(
  Tr = t_mat$qcand,
  X = match_covs,
  print.level = 0,
  pop.size = 1000,
  wait.generations = 10
)

# Match on the treatment indicator `qcand`, feeding in the weight matrix found
# by GenMatch above.
gen_matched <- Match(
  Tr = t_mat$qcand,
  X = match_covs,
  Weight = 3,
  Weight.matrix = gen_out$Weight.matrix
)

# get matched df: rows of the matched treated units stacked on top of the rows
# of their matched controls
idx_treated <- gen_matched$index.treated
idx_control <- gen_matched$index.control
newmatch <- bind_rows(t_mat[idx_treated, ], t_mat[idx_control, ])

# attach match weights and identifiers to the stacked rows
#   gmw - match weights, repeated for the treated and the control half
#   ids - row index into `t_mat` for every stacked row
#   uid - pair number, shared by a treated row and its matched control
n_pairs <- nrow(newmatch) / 2
new_mat2 <- bind_cols(
  newmatch,
  gmw = rep(gen_matched$weights, 2),
  ids = c(idx_treated, idx_control),
  uid = rep(1:n_pairs, 2)
)

# run models for Table A4
# Weighted negative binomial regressions on the matched, double coded sample,
# reported with HC3 standard errors.

# left column: total article count
mod_tot <- glm.nb(total_n ~ qcand, data = new_mat2, weights = gmw)
vcv_tot <- vcovHC(mod_tot, type = "HC3")
coeftest(mod_tot, vcov. = vcv_tot)

# right column: negative article count
mod_neg <- glm.nb(n_neg ~ qcand, data = new_mat2, weights = gmw)
vcv_neg <- vcovHC(mod_neg, type = "HC3")
coeftest(mod_neg, vcov. = vcv_neg)

# unique candidates: distinct names among qcand == 1 rows, then among
# qcand == 0 rows
new_mat2 %>%
  filter(qcand == 1) %>%
  distinct(name)
new_mat2 %>%
  filter(qcand == 0) %>%
  distinct(name)


